import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly
plotly.offline.init_notebook_mode()
def create_dict(keys, values):
    """
    Build a dictionary pairing each key with the value at the same position.

    Extra keys or values beyond the shorter input are ignored (zip semantics).
    """
    return {key: value for key, value in zip(keys, values)}
def grouping_pop(df, level, start_rename):
    '''
    Create summary tables with mean, std and n values per group defined by level.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data: grouping columns plus numeric measurement columns.
    level : list of str
        Column names to group by.
    start_rename : int
        Positional index of the first measurement column in the grouped result;
        columns from this index onward get a '_mean' / '_std' suffix.

    Returns
    -------
    tuple of pandas.DataFrame
        (df_mean, df_std, df_n): per-group means, standard deviations and sizes.
    '''
    # per-group means; dropna(axis=1) removes columns that are all-NaN after
    # aggregation. NOTE(review): on pandas >= 2.0, .mean() on non-numeric
    # columns raises instead of producing NaN — may need numeric_only=True.
    df_mean = df.groupby(level).mean().reset_index().dropna(axis = 1)
    # two rename maps appending '_mean' / '_std' to the measurement columns
    temp_dict = [create_dict(df_mean.columns[start_rename::], df_mean.columns[start_rename::]+ i) for i in ['_mean', '_std']]
    df_mean = df_mean.rename(columns = temp_dict[0])
    # NOTE(review): std uses dropna(thresh=5) while mean uses a plain dropna,
    # so the two tables may keep different columns — confirm this is intended
    df_std = df.groupby(level).std().reset_index().dropna(axis = 1, thresh=5)
    df_std= df_std.rename(columns = temp_dict[1])
    # per-group sample sizes
    df_n = df.groupby(level).size().reset_index(name='counts')
    return df_mean, df_std, df_n
def plot_heatmap(cor, mode, ann=False):
    '''
    Plot a lower-triangle heatmap from a correlation matrix; 2 modes available:
    abs: compute the absolute values of the correlation matrix (values bounded in [0, 1])
    cor: raw correlation values (values bounded in [-1, 1])

    Parameters
    ----------
    cor : pandas.DataFrame
        Square correlation matrix (e.g. df.corr()).
    mode : {'cor', 'abs'}
        Color-scale mode, see above.
    ann : bool
        Annotate each cell with its value.
    '''
    assert mode in ['cor','abs'], 'mode should be one of ["cor","abs"]'
    if mode == 'cor':
        vm = -1
    else:
        vm = 0
        cor = np.abs(cor)
    fig, ax = plt.subplots(figsize=(15, 15))
    # mask the upper triangle; builtin `bool` replaces np.bool, which was
    # deprecated in NumPy 1.20 and removed in 1.24
    mask = np.triu(np.ones_like(cor, dtype=bool))
    # drop the first row / last column so the empty diagonal corner is hidden
    mask = mask[1:, :-1]
    corr = cor.iloc[1:, :-1].copy()
    # plot heatmap
    sns.heatmap(corr, mask=mask, annot=ann, fmt=".2f", cmap='Blues',
                vmin=vm, vmax=1, cbar_kws={"shrink": .8})
    # keep y tick labels horizontal
    plt.yticks(rotation=0)
    plt.show()
def mean_confidence_interval(sm, n, confidence=0.95, verbose = False):
    '''
    Compute the half-width ("delta to mean") of the confidence interval of the mean.

    Parameters
    ----------
    sm : sequence of float
        Standard deviation of each group.
    n : sequence of int
        Number of individuals in each group.
    confidence : float
        Two-sided confidence level (default 0.95).
    verbose : bool
        If True, print the intermediate t, se and ci values.

    Returns
    -------
    numpy.ndarray
        Half-widths; add to / subtract from the group means to get the interval.
    '''
    n = np.asarray(n)
    sm = np.asarray(sm)
    # two-sided Student t quantile with n-1 degrees of freedom.
    # stats.t.ppf broadcasts over arrays, so the original per-element
    # Python loop (and the redundant re-wrapping in np.array) is unnecessary.
    tval = stats.t.ppf((1 + confidence) / 2., n - 1)
    # standard error times t quantile = CI half-width
    h = sm / np.sqrt(n) * tval
    if verbose:
        print('t values : {}'.format(np.around(tval, decimals=2)))
        print('\nse values : {}'.format(np.around(sm, decimals=2)))
        print('\nci values : {}'.format(np.around(h, decimals=2)))
    return h
class MyPCA():
    '''
    Small PCA workflow helper: standardize, fit, scree plot, individual map,
    variable correlation circle and squared cosines. Follows the Tanagra
    "ACP avec Python" tutorial
    (file:///home/xavier/Downloads/fr_Tanagra_ACP_Python.pdf).

    NOTE(review): the original class body re-imported numpy/pandas/sklearn,
    which only created unused class attributes; those imports were removed —
    the module-level imports already provide these names.
    '''
    def __init__(self, df):
        self.df = df          # raw data, observations x variables
        self.n = df.shape[0]  # number of individuals (rows)
        self.p = df.shape[1]  # number of variables (cols)

    def standardize(self):
        '''Z-score the data; stores the standardized array in self.Z.'''
        sc = StandardScaler()
        self.Z = sc.fit_transform(self.df)

    def dopca(self):
        '''Fit a full-rank PCA on self.Z; stores coordinates and eigenvalues.'''
        try:
            self.Z
        except AttributeError:  # narrowed from a bare `except:`
            print("Z is not defined please standardize before")
        self.acp = PCA(svd_solver='full')
        self.coord = self.acp.fit_transform(self.Z)
        self.eigval = self.acp.explained_variance_

    def assess_pca(self):
        '''Show the scree plot and the cumulative explained-variance plot.'''
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 6))
        ax1.bar(np.arange(1, self.p + 1), self.acp.explained_variance_)
        ax1.set_title("Scree plot")
        ax1.set_ylabel("Eigen values")
        ax1.set_xlabel("Factor number")
        ax2.bar(np.arange(1, self.p + 1), np.cumsum(self.acp.explained_variance_ratio_))
        ax2.set_title("Explained variance vs. # of factors")
        ax2.set_ylabel("Cumsum explained variance ratio")
        ax2.set_xlabel("Factor number")
        plt.show()

    def plot_indiv(self, label):
        '''Scatter the individuals on the first two factors, annotated with `label`.'''
        fig, axes = plt.subplots(figsize=(12, 12))
        axes.set_xlim(-6, 6)  # same limits on x
        axes.set_ylim(-6, 6)  # and on y
        # one annotation per observation
        assert self.n == label.shape[0], 'rows number should have the same length than label'
        for i in range(self.n):
            plt.annotate(label[i], (self.coord[i, 0], self.coord[i, 1]))
        # draw the factor axes
        plt.plot([-6, 6], [0, 0], color='silver', linestyle='-', linewidth=1)
        plt.plot([0, 0], [-6, 6], color='silver', linestyle='-', linewidth=1)
        plt.show()

    def _compute_corvar(self):
        '''Correlation of each variable with each factor (loadings * sqrt(eigenvalue)).'''
        print('computing factor variable correlation')
        sqrt_eigval = np.sqrt(self.eigval)
        self.corvar = np.zeros((self.p, self.p))
        for k in range(self.p):
            self.corvar[:, k] = self.acp.components_[k, :] * sqrt_eigval[k]

    def plot_features(self, label):
        '''Draw the correlation circle of the variables on the first two factors.'''
        self._compute_corvar()
        fig, axes = plt.subplots(figsize=(8, 8))
        axes.set_xlim(-1, 1)
        axes.set_ylim(-1, 1)
        # one annotation per variable
        assert self.p == label.shape[0], 'cols number should have the same length than label'
        for j in range(self.p):
            plt.annotate(label[j], (self.corvar[j, 0], self.corvar[j, 1]))
        # draw the factor axes
        plt.plot([-1, 1], [0, 0], color='silver', linestyle='-', linewidth=1)
        plt.plot([0, 0], [-1, 1], color='silver', linestyle='-', linewidth=1)
        # unit circle (correlations are bounded by 1)
        cercle = plt.Circle((0, 0), 1, color='blue', fill=False)
        axes.add_artist(cercle)
        plt.show()

    def compute_cos2(self):
        '''Print squared cosines (quality of representation) sorted for axes 1 and 2.'''
        try:
            self.corvar
        except AttributeError:  # narrowed from a bare `except:`
            print("corvar is not defined, use plot_features before")
        self.cos2var = self.corvar**2
        # build the table once instead of twice, then sort per axis
        cos2_df = pd.DataFrame({'id': self.df.columns,
                                'COS2_1': self.cos2var[:, 0],
                                'COS2_2': self.cos2var[:, 1]})
        print('Axis 1\n---------------------------------------------\n')
        print(cos2_df.sort_values('COS2_1', ascending = False))
        print('Axis 2\n---------------------------------------------\n')
        print(cos2_df.sort_values('COS2_2', ascending = False))

# import df (semicolon-separated table of individual measurements)
df = pd.read_table("/home/xavier/Documents/research/FORMANRISK/data/data_formanrisk/individual_join.csv", sep = ";")
# remove few columns (coordinates, identifiers and normalized duplicates)
df = df.drop(columns = ["X","Y",'X_TYPE_', 'X_FREQ_', 'individual', 'branch_diam', 'branch_diamn','num',
                        'P50n','P12n','P88n','slopen','Kmaxn'])
print('dimensions of df are \nnrows : {0}\nncols : {1}'.format(df.shape[0], df.shape[1]))
# remove the _15 from bioclim var
df.columns = [re.sub("_15", "", c) for c in df.columns]
# extracting index of bioclim var (columns named bio1..bio19)
# NOTE(review): non-raw pattern 'bio\d{1,2}' works but triggers a
# DeprecationWarning on recent Pythons; prefer r'bio\d{1,2}'
bio_index = [i for i, item in enumerate(df.columns) if re.search('bio\d{1,2}', item)]
# renaming bioclim var with meaningful names (WorldClim bio1..bio19)
keys = ["bio1" ,"bio2" ,"bio3" ,"bio4" ,"bio5" ,"bio6" ,"bio7" ,"bio8" ,"bio9" ,"bio10" ,"bio11" ,"bio12" ,"bio13" ,"bio14" ,"bio15" ,"bio16" ,"bio17" ,"bio18" ,"bio19"]
values = ["Tmean_annual" ,"Mean_D_range" ,"Isothermality" ,"T_seasonality" ,"Tmax_warmerM" ,"Tmin_coldestM" ,"T_annual_range" ,"Tmean_wettestQ" ,"Tmean_driestQ" ,"Tmean_warmerQ" ,"Tmean_coldestQ" ,"P_annual" ,"P_wettestM" ,"P_driestM" ,"P_seasonality" ,"P_wettestQ" ,"P_driestQ" ,"P_warmestQ" ,"P_coldestQ"]
dictionary = create_dict(keys,values)
df = df.rename(columns = dictionary)
# creating summary tables with mean, std and n values per group defined by level
df_pop_mean,df_pop_std,df_pop_n = grouping_pop(df=df, level=['Species','site'], start_rename=2)
# extracting labels of the numeric summary columns (used later for .corr())
label_num = df_pop_mean.iloc[:,2::].columns
# concat mean, std and n summary tables
df_pop_mean = pd.concat([df_pop_mean,df_pop_std,df_pop_n], axis = 1)
# remove duplicated columns (grouping keys are repeated by the concat)
df_pop_mean =df_pop_mean.loc[:,~df_pop_mean.columns.duplicated()]
# same summaries with the extra Treatment grouping level
df_pop_mean_T,df_pop_std_T ,df_pop_n_T = grouping_pop(df=df, level=['Species','site','Treatment'], start_rename=3)
# concat mean, std and n summary tables
df_pop_mean_T = pd.concat([df_pop_mean_T ,df_pop_std_T ,df_pop_n_T ], axis = 1)
# remove duplicated columns
df_pop_mean_T =df_pop_mean_T.loc[:,~df_pop_mean_T.columns.duplicated()]
# keeping only Pinus pinaster populations
df_pp = df[df.Species == "pinus pinaster"]
df_mean_pp = df_pop_mean[df_pop_mean.Species == "pinus pinaster"]
df_mean_pp_T = df_pop_mean_T[df_pop_mean_T.Species == "pinus pinaster"]
# grouped bar chart: number of individuals per site and Treatment
fig = px.bar(df_mean_pp_T, x='site', y='counts',
             hover_data=['Treatment','counts'], color='Treatment',
             labels={'counts':'Nb indivs per site per Treatment (P. pinaster)'},
             height=400, barmode='group')
fig.show()
Data are cleaned; let's now move on to some data exploration
# distribution of P50 per Treatment, with a marginal rug plot
fig = px.histogram(df_pp, x="P50", color = 'Treatment', marginal="rug",
                   hover_data=df_pp.columns)
fig.show()
# P50 per site, split by Treatment
fig = px.box(df_pp, x="site", y="P50", color = 'Treatment')
fig.show()
# half-width of the 95% CI of the mean P50 for the adult groups
mci = mean_confidence_interval(sm = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_std"],
                               n = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "counts"],
                               verbose = False)
# scatter of adult vs young mean P50 per population, with the 95% CI of the
# adult mean as symmetric error bars and an OLS trendline
fig = px.scatter(x=df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='young', "P50_mean"],
                 y=df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_mean"],
                 trendline="ols",
                 error_y=mci,
                 error_y_minus=mci,
                 labels={
                     "x": "P50 mean young",
                     "y": "P50 Mean adult"
                 },
                 # typo fixed in the displayed title: "pof youngs" -> "of youngs"
                 title="P50 of adults vs P50 of youngs per population with 95% confidence interval of the mean")
fig.show()
# OLS summary of the fitted trendline
results = px.get_trendline_results(fig)
print('Statistics summary\n-------------------------\n')
results.px_fit_results.iloc[0].summary()
There is a significant correlation between young and adult P50 (R² = 0.54, F = 15.24, p = 0.0018). The estimated value of the slope is 0.69 (t = 3.9, p = 0.002).
# correlation matrix of the per-population mean values
df_corr= df_mean_pp[label_num].corr()
# absolute-value heatmap with cell annotations
plot_heatmap(cor=df_corr, mode='abs', ann=True)
Strongest correlation between P50 and:
import plotly.express as px  # already imported at the top; leftover notebook cell
# individual-level P50 vs annual mean temperature, OLS fit per Treatment
fig = px.scatter(df_pp, x="Tmean_annual", y="P50", color="Treatment", trendline="ols", title = 'P50 vs Tmean')
fig.show()
results = px.get_trendline_results(fig)
print(results)
# per-Treatment OLS summaries (uncomment to inspect)
# results.query("Treatment == 'adult'").px_fit_results.iloc[0].summary()
# results.query("Treatment == 'young'").px_fit_results.iloc[0].summary()
Very weak R² between individual measures of P50 and the annual mean temperature of the population (R² = 0.02 and 0.06)
Without the Treatment effect
# keep P50 and a set of temperature-related bioclim means (population level)
df_mean_pp_wide = df_mean_pp[["Tmean_annual_mean",
                              'Tmean_coldestQ_mean',
                              'T_seasonality_mean',
                              'T_annual_range_mean',
                              'Tmin_coldestM_mean',
                              "P50_mean",
                              'site']]
# long format: one row per (site, P50_mean, climate variable, value)
df_mean_pp_wide = pd.melt(df_mean_pp_wide,
                          id_vars=[
                              'site',
                              "P50_mean"],
                          value_vars=["Tmean_annual_mean",
                                      'Tmean_coldestQ_mean',
                                      'T_seasonality_mean',
                                      'T_annual_range_mean',
                                      'Tmin_coldestM_mean'
                                      ])
df_mean_pp_wide.columns
# one OLS-fitted scatter facet per climate variable
fig = px.scatter(df_mean_pp_wide,
                 x="value",
                 y="P50_mean",
                 trendline="ols",
                 text = "site",
                 facet_col="variable",
                 facet_col_wrap=3,
                 facet_row_spacing=0.04, # default is 0.07 when facet_col_wrap is used
                 facet_col_spacing=0.04, # default is 0.03
                 height=800, width=1000)
fig.update_traces(textposition='top center')
# free x-axis per facet (variables have different units/ranges)
fig.update_xaxes(matches=None)
fig.show()
NB: Oin (es, fr & P) is a common garden located in Coruña (Spain) with 3 provenances (Spanish, Portuguese (Leiria) and French)
With the Treatment effect
# same selection as above, but keeping the Treatment column
df_mean_pp_T_wide = df_mean_pp_T[["Tmean_annual_mean",
                                  'Tmean_coldestQ_mean',
                                  'T_seasonality_mean',
                                  'T_annual_range_mean',
                                  'Tmin_coldestM_mean',
                                  "P50_mean",
                                  "Treatment",
                                  'site']]
# long format, one row per (Treatment, site, P50_mean, climate variable, value)
df_mean_pp_T_wide = pd.melt(df_mean_pp_T_wide,
                            id_vars=["Treatment",
                                     'site',
                                     "P50_mean"],
                            value_vars=["Tmean_annual_mean",
                                        'Tmean_coldestQ_mean',
                                        'T_seasonality_mean',
                                        'T_annual_range_mean',
                                        'Tmin_coldestM_mean'
                                        ])
df_mean_pp_T_wide.columns
# facets per climate variable, OLS fit per Treatment
fig = px.scatter(df_mean_pp_T_wide,
                 x="value",
                 y="P50_mean",
                 color="Treatment",
                 trendline="ols",
                 text = "site",
                 facet_col="variable",
                 facet_col_wrap=3,
                 facet_row_spacing=0.04, # default is 0.07 when facet_col_wrap is used
                 facet_col_spacing=0.04, # default is 0.03
                 height=800, width=1000)
fig.update_traces(textposition='top center')
# free x-axis per facet
fig.update_xaxes(matches=None)
fig.show()
import plotly.express as px  # leftover notebook cell; px already imported
# zoom on a single climate variable: P50 vs coldest-quarter temperature
fig = px.scatter(df_mean_pp_T, x="Tmean_coldestQ_mean", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
Results:
Some traits seem to be more correlated to P50, as seen on the heatmap, with maybe some slight differences between treatments
# PCA on the per-(site, Treatment) means of the renamed bioclim variables
mypca = MyPCA(df_mean_pp_T[[v +'_mean' for v in values]] )
mypca.standardize()
mypca.dopca()
mypca.assess_pca()
mypca.plot_indiv(label = df_mean_pp_T.site)
mypca.plot_features(label = df_mean_pp_T[[v +'_mean' for v in values]].columns)
mypca.compute_cos2()
# attach the factor coordinates (acp_0, acp_1, ...) to the summary table
acp_coord = pd.DataFrame(mypca.coord, columns = ['acp_'+str(i) for i in np.arange(0,mypca.coord.shape[1])])
df_mean_pp_T_acp = pd.concat([df_mean_pp_T, acp_coord], axis = 1)
# P50 vs first PCA axis, OLS fit per Treatment
fig = px.scatter(df_mean_pp_T_acp, x="acp_0", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
# P50 vs second PCA axis, OLS fit per Treatment
fig = px.scatter(df_mean_pp_T_acp, x="acp_1", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
It seems that the correlation is better between P50 and the second axis of the PCA (R² = 0.23, 0.26), but it is still weak; maybe fit a non-linear model
first axis is associated with :
second axis is associated with :
# PCA on the individual-level bioclim variables (raw, not averaged)
mypca = MyPCA(df_pp.iloc[:,bio_index] )
mypca.standardize()
mypca.dopca()
mypca.assess_pca()
mypca.plot_indiv(label = df_pp.site)
# attach individual PCA coordinates to the individual-level table
acp_coord = pd.DataFrame(mypca.coord, columns = ['acp_'+str(i) for i in np.arange(0,mypca.coord.shape[1])])
df_pp_acp = pd.concat([df_pp, acp_coord], axis = 1)
df_pp_acp
# flip to True to export both tables to csv
if False:
    df_mean_pp_T_acp.to_csv("/home/xavier/Documents/research/FORMANRISK/analyse/forman_cavit/output/table/df_mean_PP.csv")
    df_pp_acp.to_csv("/home/xavier/Documents/research/FORMANRISK/analyse/forman_cavit/output/table/df_PP.csv")
# Import the linear regression model class
from pymer4.models import Lm
# Fixed-effects-only baseline: P50 explained by Treatment and site
model = Lm("P50 ~ Treatment + site", data=df_pp_acp)
# Fit it
print(model.fit())
# restrict to the columns used by the mixed models
df_restricted = df_pp_acp[['P50','site', 'Treatment', 'acp_0', 'acp_1']]
from pymer4.models import Lmer
# Initialize model instance using 1 predictor with random intercepts
# this is the full model: both PCA axes, random site intercept
model = Lmer("P50 ~ Treatment + acp_0 + acp_1 + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
# Initialize model instance using 1 predictor with random intercepts
# no climatic variables
model = Lmer("P50 ~ Treatment + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
# random intercept AND random acp_1 slope per site
model = Lmer("P50 ~ Treatment + acp_1 + (acp_1|site)", data=df_restricted)
# Fit it
print(model.fit())
# random intercept only, acp_1 as a fixed effect
model = Lmer("P50 ~ Treatment + acp_1 + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
This is the best model based on AIC value (both models, random intercept & random intercept + slope, perform very similarly)
No effect of treatment effect on acp_1 which is the second axis of the pca correlated with :
# Visualize coefficients with group/cluster fits overlaid ("forest plot")
model.plot_summary()
# partial-effect plot of acp_1 with its confidence band
model.plot("acp_1", plot_ci=True, ylabel="predicted DV")
# set up rpy2 to re-fit the selected model in R (cross-check with lmerTest)
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
# load the R packages used below
base = importr('base')
# NOTE(review): this rebinds the module-level name `stats` (scipy.stats);
# any later call to mean_confidence_interval would hit the R package instead
stats = importr('stats')
graphics = importr('graphics')
utils = importr('utils')
ade4 = importr('ade4')
nlme = importr('nlme')
lme4 = importr('lme4')
lmertest = importr('lmerTest')
from rpy2.robjects.conversion import localconverter
import rpy2.ipython.html
rpy2.ipython.html.init_printing()
# convert the pandas DataFrame to an R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
    df_restricted_r = ro.conversion.py2rpy(df_restricted)
# sanity check: dimensions on the R side
base.dim(df_restricted_r)
from rpy2.robjects.packages import importr
utils = importr('utils')
# --- the lines below are IPython/Jupyter magics and R code; they only run
# --- inside a notebook with the rpy2 extension loaded, not as plain Python
%load_ext rpy2.ipython
%%R -i df_restricted_r
require(tidyverse)
require(dplyr)
require(lme4)
require(nlme)
require(lmerTest)
# glimpse(df_restricted_r)
%%R
# refit the selected model in R with lmerTest
df_restricted_r[,'Treatment'] = as.factor(df_restricted_r[,'Treatment'] )
df_restricted_r[,'site'] = as.factor(df_restricted_r[,'site'] )
mm1 = lmer(P50 ~ Treatment + acp_1 + (1|site), data = df_restricted_r)
summary(mm1)
%%R
# residual diagnostics: histogram, QQ plot and residuals-vs-fitted
hist(residuals(mm1))
qqnorm(residuals(mm1))
qqline(residuals(mm1))
plot(fitted(mm1)~residuals(mm1))
%%R
# residuals per Treatment level
plot(residuals(mm1)~as.factor(df_restricted_r[,'Treatment']))
%%R
# residuals against the two PCA axes
plot(residuals(mm1)~df_restricted_r[,'acp_0'])
plot(residuals(mm1)~df_restricted_r[,'acp_1'])
import pyjags
import numpy as np
np.random.seed(0)
np.set_printoptions(precision=3)
from sklearn import preprocessing
# encode Treatment and site as 1-based integer codes for JAGS indexing
le = preprocessing.LabelEncoder()
treat_le = le.fit_transform(df_restricted.Treatment)+1
site_le = le.fit_transform(df_restricted.site)+1
# Bayesian version of the selected model: intercept a0, Treatment effect a1,
# fixed acp_1 slope a2, random site intercepts A3 with shared precision tauS
code = '''
model {
for (i in 1:N) {
y[i] ~ dnorm(a0 + a1[Treatment[i]] + a2*Covar[i] + A3[Site[i]] , tau)
}
a0~dnorm(-3.8, 1e-4)
tau ~ dgamma(1e-4, 1e-4)
a2~dnorm(0, 1e-4)
for(t in 1:2){
a1[t]~dnorm(0.0, 1e-4)
}
for(s in 1:NSITE){
A3[s]~dnorm(0.0, tauS)
}
tauS ~ dgamma(1e-4, 1e-4)
}
'''
model = pyjags.Model(code, data=dict(Treatment=treat_le,
                                     Covar=df_restricted.acp_1,
                                     Site=site_le,
                                     y = df_restricted.P50,
                                     N=df_restricted.shape[0],
                                     NSITE=df_restricted.site.unique().shape[0]),
                     chains=4,
                     threads=4,
                     chains_per_thread=1)
# variables to monitor
var = ['a0', 'a1', 'a2', 'A3', 'tauS']
# warmup / burn-in iterations, not used for inference.
model.sample(10000, vars=[])
# posterior draws, thinned by 10
samples = model.sample(10000,
                       vars=var,
                       thin = 10)
def summary(samples, varname, p=95):
    '''
    Print the posterior median and the p% credible interval of one variable.

    Parameters
    ----------
    samples : dict of str -> numpy.ndarray
        Posterior draws keyed by variable name.
    varname : str
        Key into samples.
    p : int or float
        Credible-interval mass in percent (default 95).
    '''
    values = samples[varname]
    # bug fix: the original used np.percentile(values, [100-p, p]), which for
    # p=95 yields the [5th, 95th] percentiles — a 90% interval mislabeled as
    # 95%. Use a symmetric tail of (100-p)/2 on each side instead.
    tail = (100 - p) / 2.
    ci = np.percentile(values, [tail, 100 - tail])
    print('{:<6} median = {:>5.3f}, {}% credible interval [{:>4.3f} {:>4.3f}]'.format(
        varname, np.median(values), p, *ci))
# print the posterior summary of every monitored variable
for varname in var:
    summary(samples, varname)
Formula: P50~Treatment+acp_1+(1|site)
Family: gaussian Inference: parametric
Number of observations: 406 Groups: {'site': 15.0}
Log-likelihood: -90.438 AIC: 180.877
Random effects:
Name Var Std
site (Intercept) 0.012 0.108 Residual 0.083 0.289
No random effect correlations specified
Fixed effects:
Estimate 2.5_ci 97.5_ci SE DF T-stat P-val Sig
(Intercept) -3.833 -3.900 -3.765 0.034 18.521 -111.211 0.000 *
Treatmentyoung 0.035 -0.022 0.091 0.029 391.798 1.195 0.233
acp_1 -0.036 -0.060 -0.013 0.012 13.501 -3.038 0.009
# posterior distribution of the intercept a0 (all chains flattened)
plt.hist(np.ravel(samples['a0']))